library(caret)
library(dplyr)
library(mltools)
library(rpart)
library(isotree)
library(plotly)
Let’s check whether the dataset “PenguinsWithoutMissingValues.csv” contains anomalies. We use an Isolation Forest for this because it has some advantages over other anomaly detection techniques.
# Load the penguin dataset from PenguinsWithoutMissingValues.csv.
# NOTE(review): the working directory below is a machine-specific absolute
# path, so the script only runs as-is where that folder exists.
setwd("C:/Users/Dari-Laptop/Desktop/FH Karnten - Master - AppDs/StatisticsAppDSLaptop")
data <- read.csv("PenguinsWithoutMissingValues.csv")

# Encode the categorical columns as factors, all in one step.
categorical_columns <- c("Species", "Island", "Gender")
data[categorical_columns] <- lapply(data[categorical_columns], factor)
# Fit an isolation forest (10 trees, single-variable splits, one thread) on
# `data` and request the average isolation depth of every row.
# NOTE(review): `data` is passed unfiltered, so any identifier column it holds
# (e.g. IndividualID, which the ggplot at the end of the script reads) is also
# used as a feature -- confirm that this is intended.
model <- isolation.forest(
  data,
  ndim = 1,
  ntrees = 10,
  nthreads = 1
)
scores <- predict(model, newdata = data, type = "avg_depth")
# Scatter plots of the average isolation depth against each measurement
# column, one plot per column, drawn in the order listed below.
par(mar = c(4, 5, 3, 2))

# Maps each column name in `data` to the label shown in the plot title.
measurement_labels <- c(
  "CulmenLength.mm." = "CulmenLength(mm)",
  "CulmenDepth.mm." = "CulmenDepth(mm)",
  "FlipperLength.mm." = "FlipperLength(mm)",
  "BodyMass.g." = "BodyMass(g)"
)

for (column in names(measurement_labels)) {
  plot(
    data[[column]], scores,
    type = "p", col = "darkred",
    main = paste0(
      "Average isolation depth\nfor ", measurement_labels[[column]]
    ),
    xlab = "value", ylab = "Average isolation depth"
  )
}
# Store the model's prediction for every row as the new column `score`
# (no `type` is given, so predict() uses its default output) and print it.
data[["score"]] <- predict(model, newdata = data)
print(data[["score"]])
## [1] 0.4578364 0.4919289 0.4717983 0.4634670 0.4627224 0.4580191 0.4888889
## [8] 0.4884262 0.4743137 0.5400898 0.4434475 0.4787488 0.5189226 0.4837096
## [15] 0.5121948 0.4851209 0.4725081 0.4649658 0.4844198 0.5596001 0.4406936
## [22] 0.4405415 0.4383124 0.4780684 0.4598501 0.4964540 0.4992688 0.4405060
## [29] 0.4340397 0.4201672 0.4951317 0.4360031 0.5190058 0.4767544 0.4474137
## [36] 0.4330214 0.4468157 0.4337378 0.4930378 0.4235189 0.4284610 0.4719479
## [43] 0.4210829 0.5073663 0.4520723 0.4215195 0.4433411 0.4537013 0.4715384
## [50] 0.4319524 0.4297998 0.4514957 0.4636074 0.4311626 0.4337975 0.4782678
## [57] 0.4326671 0.4496893 0.4795662 0.4594558 0.4979144 0.4140480 0.4647373
## [64] 0.4723818 0.5398571 0.4235979 0.4978067 0.4882456 0.4845802 0.4628337
## [71] 0.4973875 0.4604293 0.4927327 0.4952524 0.5204366 0.5135453 0.4602299
## [78] 0.5069460 0.4090961 0.5136877 0.4746397 0.4301833 0.4413656 0.4618874
## [85] 0.4636738 0.5037086 0.4647442 0.4312264 0.4085254 0.5331339 0.4362219
## [92] 0.4297965 0.5557771 0.4530897 0.4452467 0.4896844 0.5011017 0.4357118
## [99] 0.4354940 0.4440621 0.4647401 0.4451882 0.4394154 0.4668099 0.4938844
## [106] 0.4916919 0.4318985 0.4317359 0.5247426 0.4345292 0.5165276 0.4808505
## [113] 0.4688223 0.4716961 0.4772260 0.5064542 0.5044225 0.4739909 0.4903396
## [120] 0.4496964 0.4434714 0.4472747 0.4665368 0.5448095 0.4436527 0.4878292
## [127] 0.4668022 0.5048415 0.4036845 0.4560588 0.4433422 0.4855180 0.4421209
## [134] 0.4261492 0.4365401 0.4723759 0.6064683 0.4877523 0.4379296 0.4284479
## [141] 0.4225492 0.4285533 0.4236560 0.4859622 0.4543080 0.4745389 0.4323452
## [148] 0.4320515 0.4379103 0.4658032 0.4788382 0.4574619 0.4626743 0.4258793
## [155] 0.5110611 0.4583544 0.4403821 0.4434096 0.4321954 0.4296062 0.4306709
## [162] 0.4390258 0.4992455 0.5432149 0.4451476 0.4910207 0.4799401 0.5033391
## [169] 0.4974272 0.4560530 0.4284908 0.4373445 0.4507294 0.4279020 0.4212556
## [176] 0.4914383 0.4588224 0.5181932 0.4845815 0.4327766 0.4274348 0.4964224
## [183] 0.4250045 0.5459582 0.5683148 0.5267883 0.4386547 0.4266698 0.4514697
## [190] 0.4513407 0.4369203 0.4510576 0.4422928 0.4955650 0.5747865 0.4659029
## [197] 0.5480835 0.4254013 0.4649305 0.4645948 0.4655877 0.4683809 0.4682885
## [204] 0.4547470 0.4325063 0.4777032 0.4222985 0.4706933 0.4398649 0.4833682
## [211] 0.4925572 0.4290702 0.4414542 0.4363800 0.4597009 0.4358379 0.4423925
## [218] 0.4385070 0.4316842 0.4987000 0.4195007 0.4386651 0.4602693 0.4642384
## [225] 0.5054918 0.3946461 0.4512227 0.5001638 0.4290825 0.4226430 0.5115130
## [232] 0.4991299 0.4261074 0.4399429 0.4451592 0.4102558 0.4305178 0.4415535
## [239] 0.4727690 0.4651656 0.4214929 0.4714859 0.4435263 0.4442092 0.4426693
## [246] 0.4282673 0.5426833 0.4819088 0.3924836 0.4456695 0.4778104 0.4710584
## [253] 0.4986112 0.4831991 0.4055162 0.4206496 0.4767074 0.3972894 0.4457665
## [260] 0.4625595 0.3951124 0.4773757 0.4727027 0.4148229 0.4505772 0.4155020
## [267] 0.4670992 0.4073763 0.5061868 0.4700047 0.4522723 0.4224750 0.4476955
## [274] 0.4179520 0.4891470 0.4351705 0.4778159 0.4030806 0.4757101 0.4163724
## [281] 0.4904007 0.4367893 0.4354727 0.4595978 0.4525370 0.4510837 0.4123429
## [288] 0.4545669 0.4507523 0.4258496 0.4081464 0.4612861 0.4493849 0.4271062
## [295] 0.4212851 0.3951124 0.4203855 0.5032602 0.4626600 0.4432540 0.4101284
## [302] 0.4983920 0.4362031 0.4936398 0.4168934 0.4409982 0.4175788 0.4243844
## [309] 0.4540306 0.4900495 0.4826956 0.4297941 0.4452357 0.5149059 0.4975624
## [316] 0.5031785 0.4391660 0.4507518 0.4743276 0.4432246 0.4998094 0.4796001
## [323] 0.4441266 0.4930681 0.4594674 0.4410230 0.5290099 0.4098169 0.4530819
## [330] 0.4191695 0.4220659 0.4480337 0.4637000
# Use a ggplotly scatter plot to visualize the anomaly scores
# and show the IndividualIDs in the hover text.
# The y position is simply the row index; seq_len() is used instead of
# 1:nrow(data) so that an empty data frame yields an empty index rather
# than c(1, 0).
p <- ggplot(
  data,
  aes(
    x = score, y = seq_len(nrow(data)), color = Species,
    text = paste("IndividualID :", IndividualID)
  )
) +
  geom_point() +
  labs(title = "Anomaly scores", x = "Score", y = "Number")

# Show the static ggplot first, then its interactive plotly version.
p
ggplotly(p)